Click code in the right side to see the code
setwd("/data02/Analysis/Projects/2_CPE_Transmission/Kmer_Comparison_ForPlasmidClustering")
library(igraph)
library(ggplot2)
library(circlize)
library(dplyr)
library(ggrepel)
library(kableExtra)
library(InteractiveComplexHeatmap)
library(ComplexHeatmap)
plasmid_similarity_df <- data.table::fread("List_1071_Plasmid_seqs_kmer_comp_Jaccard_v2.tab", header= FALSE, sep = "\t")
#head(plasmid_similarity_df)
colnames(plasmid_similarity_df) <- c("Plasmid1","Plasmid1_length","Plasmid2","Plasmid2_length", "Plasmid_similarity","Plasmid_disimilarity")
Click code in the right side to see the code
# Generate a similarity matrix
simMatrix <- plasmid_similarity_df %>%
# head() %>%
select(Plasmid1, Plasmid2, Plasmid_similarity) %>%
data.table::dcast(Plasmid1~Plasmid2, value.var="Plasmid_similarity") %>%
replace(is.na(.), 1)
simMatrix <- simMatrix %>% tibble::column_to_rownames("Plasmid1")
simMatrix[is.na(simMatrix)] <- 1
p1 <- Heatmap(as.matrix(simMatrix), name = "Plasmid Similarity", show_row_names = FALSE, show_column_names = FALSE)
Closed_CP_Plasmids_metadata_df <- data.table::fread("/data02/Analysis/Projects/2_CPE_Transmission/Reference_Excels/Closed_CP_Plasmids_metadata_07102022.csv", header= TRUE, sep = ",")
Closed_CP_Plasmids_metadata_subsetdf <- Closed_CP_Plasmids_metadata_df %>% select(Plasmid_name_filename,CP_Gene,Hospital,Sample_type,Inc_group_regrouped)
Plas_CPdf <- Closed_CP_Plasmids_metadata_subsetdf %>% select(Plasmid_name_filename,CP_Gene)
Plas_CPdf <- Plas_CPdf %>% tibble::column_to_rownames("Plasmid_name_filename")
p2 <- Heatmap(Plas_CPdf, name = "CP Genes", show_row_names = FALSE)
#p1
#ht_list <- p1 + p2
#ht_list
#htShiny(ht_list)
# col_fun = colorRamp2(c(0, 0.25, 0.5, 0.75, 1), c( "#06d6a0", "#80cfba", "#ffd166", "#eda4b5", "#ef476f"))
# #col_fun
# Heatmap(as.matrix(simMatrix), name = "Plasmid Similarity", show_row_names = FALSE, show_column_names = FALSE,col = col_fun)
# Heatmap(as.matrix(simMatrix), name = "Plasmid Similarity", show_row_names = FALSE, show_column_names = FALSE,col = col_fun, row_km = 7, column_km = 5)
col_fun = colorRamp2(c(0, 0.25, 0.5, 0.75, 0.9, 1), c("#0232de", "#4e6dde", "#ffd166", "#fabb28", "#ef476f", "#e30742"))
Heatmap(as.matrix(simMatrix), name = "Plasmid Similarity", show_row_names = FALSE, show_column_names = FALSE,col = col_fun,
heatmap_legend_param = list(
title = "Plasmid Similarity", at = c(0, 0.25, 0.5, 0.75, 0.9, 1),
labels = c(0, 0.25, 0.5, 0.75, 0.9, 1)
))
# plasmid_similarity_df %>%
# ggplot(aes(x=Plasmid_similarity)) +
# geom_histogram(binwidth=0.02,color="black",fill = "#4dbfbc") +
# scale_x_continuous(breaks = seq(0, 20, 1)) +
# stat_bin(binwidth=0.02, geom="text", aes(label=..count..), vjust=-1.5)+
# labs(x = "Plasmid Similarity (Jaccard)", y = "Plasmid Pair Counts") +
# ggtitle("Histogram of Plasmid Similarity (Bin size 0.02)") +
# theme_classic()
#NOTE: The above geom_histogram did not give me expected counts. So, I calculated ranges first and then tabled and then made the dataframe which was then supplied to ggplot to draw a plot
stack(table(cut(plasmid_similarity_df$Plasmid_similarity,breaks=seq.int(from=0,to=1,by=0.02)))) %>%
as.data.frame() %>%
rename(PlasmidPairCount=values, Range=ind) %>%
ggplot(aes(x=Range,y=PlasmidPairCount, label = PlasmidPairCount)) +
geom_col(color="black",fill = "#4dbfbc",position = "dodge") +
geom_text(position = position_dodge(width = 1), hjust = -0.1) +
labs(x = "Plasmid Similarity (Jaccard)", y = "Plasmid Pair Counts") +
ggtitle("Histogram of Plasmid Similarity (Bin size 0.02)") +
theme_classic() +
theme(axis.text.y=element_text(size=10, vjust = 0, hjust = 1),
axis.text.x=element_text(size=10, angle=90, vjust = 0, hjust = 1.2),
axis.line=element_blank(),
axis.ticks=element_blank(),
legend.position="top") +
coord_flip()
# plasmid_similarity_df %>%
# ggplot(aes(x=Plasmid_similarity)) +
# geom_histogram(binwidth=0.05,color="black",fill = "#4dbfbc") +
# #scale_x_continuous(breaks = seq(0, 20, 1), limits = c(0, 1)) +
# stat_bin(binwidth=0.05, geom="text", aes(label=..count..), vjust=-1.5)+
# labs(x = "Plasmid Similarity (Jaccard)", y = "Plasmid Pair Counts") +
# ggtitle("Histogram of Plasmid Similarity (Bin size 0.05)") +
# theme_classic()
#
# plasmid_similarity_df %>%
# ggplot(aes(x=Plasmid_similarity)) +
# geom_histogram(
# color="black",fill = "#4dbfbc",
# breaks = seq(0, 1, by = 0.05),
# aes(fill = ..count..), position = "identity") +
# #scale_y_log10() +
# scale_x_continuous(breaks = seq(0, 1, by=0.05)) +
# stat_bin(binwidth=0.05, geom="text", aes(label=..count..), vjust=-1.5, hjust=-0.2)+
# labs(x = "Plasmid Similarity (Jaccard)", y = "Plasmid Pair Counts") +
# ggtitle("Histogram of Plasmid Similarity (Bin size 0.05)") +
# theme_classic()
#
# plasmid_similarity_df %>%
# ggplot(aes(x=Plasmid_similarity)) +
# stat_bin(binwidth=0.02) +
# stat_bin(binwidth=0.02, geom="text", aes(label=..count..), vjust=-1.5) +
# scale_x_continuous(breaks = seq(0, 1, by=0.02))
stack(table(cut(plasmid_similarity_df$Plasmid_similarity,breaks=seq.int(from=0,to=1,by=0.05)))) %>%
as.data.frame() %>%
rename(PlasmidPairCount=values, Range=ind) %>%
ggplot(aes(x=Range,y=PlasmidPairCount, label = PlasmidPairCount)) +
geom_col(color="black",fill = "#4dbfbc",position = "dodge") +
geom_text(position = position_dodge(width = 1), hjust = -0.1) +
labs(x = "Plasmid Similarity (Jaccard)", y = "Plasmid Pair Counts") +
ggtitle("Histogram of Plasmid Similarity (Bin size 0.05)") +
theme_classic() +
theme(axis.text.y=element_text(size=10, vjust = 0, hjust = 1),
axis.text.x=element_text(size=10, angle=90, vjust = 0, hjust = 1.2),
axis.line=element_blank(),
axis.ticks=element_blank(),
legend.position="top") +
coord_flip()
plasmid_similarity_df %>%
count(Plasmid_similarity) %>%
mutate(cumsum_n = cumsum(n)) %>%
ggplot(aes(x=Plasmid_similarity, y=cumsum_n)) +
geom_point(color="red", size=0.1) +
theme_classic()
# Cumulative frequency plot from 0.9
plasmid_similarity_df %>%
count(Plasmid_similarity) %>%
mutate(cumsum_n = cumsum(n)) %>%
filter(Plasmid_similarity>=0.97) %>%
ggplot(aes(x=Plasmid_similarity, y=cumsum_n)) +
geom_point(color="red", size=0.1) +
theme_classic()
# Cumulative frequency plot from 0.9
plasmid_similarity_df %>%
count(Plasmid_similarity) %>%
mutate(cumsum_n = cumsum(n)) %>%
filter(Plasmid_similarity>=0.9) %>%
ggplot(aes(x=Plasmid_similarity, y=cumsum_n)) +
geom_point(color="red", size=0.1) +
theme_classic()
# Cumulative frequency plot from 0.9
plasmid_similarity_df %>%
count(Plasmid_similarity) %>%
mutate(cumsum_n = cumsum(n)) %>%
filter(Plasmid_similarity>=0.8) %>%
ggplot(aes(x=Plasmid_similarity, y=cumsum_n)) +
geom_point(color="red", size=0.1) +
theme_classic()
#summary(plasmid_similarity_df)
for(sim_cutoff in seq(0.80, 1, by = 0.01)) {
Plaus_ClustrPairs <- plasmid_similarity_df %>%
filter(Plasmid_similarity >= sim_cutoff) %>%
select(Plasmid1,Plasmid2)
nrow(Plaus_ClustrPairs)
g1 <- graph.data.frame(Plaus_ClustrPairs, directed = FALSE)
g1
#plot(g1)
#Fancy figure start
#n = 100
#p = 1.5/n
#g = erdos.renyi.game(n, p)
#coords = layout.fruchterman.reingold(g)
#plot(g1, layout=coords, vertex.size = 3, vertex.label=NA)
#plot(g1, vertex.label=NA, vertex.size=2, vertex.color="#0CCF02")
#Fancy figure end
# To generate Sample and their ClusterNumber
cl1 <- clusters(g1)
tbl1 <- cbind( V(g1)$name, cl1$membership )
#class(tbl1)
tbl1 <- as.data.frame(tbl1)
colnames(tbl1) = c("sample", "Cluster")
tbl1 <- tbl1 %>% arrange(as.numeric(Cluster))
write.table(tbl1,file=paste0("Samples_ClusterNumber_sim_cutoff",sim_cutoff,".txt"),row.names=FALSE) # drops the rownames and write to text file
}
awk '{print FILENAME "\t" $0}' Samples*.txt | sed -e 's/Samples_ClusterNumber_sim_cutoff//g' -e 's/.txt//g' | grep -v 'sample' | sed 's/"//g' | awk '{print $1 "\t" $2 "\t" $3}' >Combined_cutoff_0.8to1_sample_clusternumber.tab
cutoff_sample_clusternumber_df <- data.table::fread("Combined_cutoff_0.8to1_sample_clusternumber.tab", header= FALSE, sep = "\t")
#head(cutoff_sample_clusternumber_df)
colnames(cutoff_sample_clusternumber_df) <- c("Cutoff","Plasmid","ClusterNumber")
Cutoff_PlasmidCount <- cutoff_sample_clusternumber_df %>%
count(Cutoff) %>%
as.data.frame() %>%
rename(PlasmidCount=n)
Cutoff_ClusterCount <-cutoff_sample_clusternumber_df %>%
group_by(Cutoff) %>%
summarise(max = max(ClusterNumber, na.rm=TRUE)) %>%
as.data.frame() %>%
rename(ClusterCount=max)
Cutoff_PlasmidCount_ClusterCount <- left_join(Cutoff_PlasmidCount, Cutoff_ClusterCount, by="Cutoff")
Cutoff_PlasmidCount_ClusterCount %>%
kbl(caption = "Similarity Cutoff, Plasmid Counts and ClusterCounts for each cutoff") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Cutoff | PlasmidCount | ClusterCount |
|---|---|---|
| 0.80 | 1010 | 40 |
| 0.81 | 1010 | 40 |
| 0.82 | 1009 | 40 |
| 0.83 | 1009 | 41 |
| 0.84 | 1009 | 41 |
| 0.85 | 1009 | 41 |
| 0.86 | 1006 | 40 |
| 0.87 | 1005 | 41 |
| 0.88 | 1005 | 42 |
| 0.89 | 1004 | 42 |
| 0.90 | 1001 | 43 |
| 0.91 | 999 | 43 |
| 0.92 | 997 | 45 |
| 0.93 | 995 | 44 |
| 0.94 | 990 | 46 |
| 0.95 | 977 | 47 |
| 0.96 | 962 | 45 |
| 0.97 | 955 | 50 |
| 0.98 | 949 | 52 |
| 0.99 | 931 | 58 |
| 1.00 | 468 | 74 |
Cutoff_PlasmidCount_ClusterCount %>%
ggplot(aes(PlasmidCount,ClusterCount), color=Cutoff) +
geom_point(color="blue", size=1) +
geom_label_repel(aes(label = Cutoff),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50') +
theme_classic()
Cutoff_PlasmidCount_ClusterCount %>%
filter(Cutoff!=1) %>%
ggplot(aes(PlasmidCount,ClusterCount), color=Cutoff) +
geom_point(color="blue", size=1) +
scale_x_continuous(breaks=seq(920, 1020, 10)) +
geom_label_repel(aes(label = Cutoff),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50') +
theme_classic()
CP_Plasmids_meta_ClusterNumber_cutoff0.9_df <- data.table::fread("Closed_CP_Plasmids_metadata_07102022.tab", header= TRUE, sep = "\t")
#head(CP_Plasmids_meta_ClusterNumber_cutoff0.9_df)
Clusters_info_df <- CP_Plasmids_meta_ClusterNumber_cutoff0.9_df %>%
select(ClusterNumber_based_on_0.9_cutoff, Inc_group_regrouped, Relaxase_type_corrected, Length,ENT_ID,PID) %>%
group_by(ClusterNumber_based_on_0.9_cutoff) %>%
summarise(Inc_group_regrouped_collapse=paste(unique(Inc_group_regrouped),collapse='#'),
Relaxase_type_corrected_collapse=paste(unique(Relaxase_type_corrected),collapse='#'),
IsolateCount=n(),
PatientCount = n_distinct(PID),
PlasmidLength_Min = min(Length),
PlasmidLength_Max = max(Length),
) %>%
rename(ClusterNum=ClusterNumber_based_on_0.9_cutoff,
Inc_Group=Inc_group_regrouped_collapse,
Relaxase=Relaxase_type_corrected_collapse)
Clusters_info_df %>%
kbl(caption = "Plasmid Clusters information table") %>%
kable_classic(full_width = F, html_font = "Cambria") %>%
column_spec(1, width = "1px")
| ClusterNum | Inc_Group | Relaxase | IsolateCount | PatientCount | PlasmidLength_Min | PlasmidLength_Max |
|---|---|---|---|---|---|---|
| 1 | IncU |
|
7 | 7 | 7580 | 7580 |
| 10 | IncH | MOBH | 3 | 3 | 306281 | 315533 |
| 11 | IncR |
|
12 | 10 | 74789 | 78200 |
| 12 | IncN | MOBF | 2 | 1 | 59471 | 59567 |
| 13 | IncF | MOBP | 4 | 4 | 135212 | 135220 |
| 14 | IncL/M | MOBP | 4 | 3 | 67445 | 71114 |
| 15 | IncF | MOBF | 4 | 4 | 173234 | 180817 |
| 16 | IncA/C2 | MOBH,MOBH | 34 | 27 | 174317 | 176865 |
| 17 | IncN | MOBF | 4 | 4 | 49992 | 50752 |
| 18 | IncH | MOBH | 3 | 3 | 310749 | 312015 |
| 19 | IncL/M | MOBP | 33 | 23 | 63589 | 69945 |
| 2 | IncU#ColRNAI_all | MOBP | 405 | 203 | 67304 | 80977 |
| 20 | IncF | MOBC | 3 | 2 | 108331 | 108331 |
| 21 | IncA/C2_IncF | MOBH,MOBH | 4 | 1 | 173887 | 173888 |
| 22 | IncH | MOBH | 2 | 2 | 278408 | 291650 |
| 23 | IncA/C2 | MOBH,MOBH | 2 | 2 | 146179 | 149464 |
| 24 | IncX3 | MOBP | 7 | 6 | 51479 | 51503 |
| 25 | IncF | MOBF | 8 | 7 | 103933 | 112627 |
| 26 | rep_cluster_1195 | MOBP | 15 | 11 | 6139 | 12279 |
| 27 | IncF | MOBF | 2 | 2 | 152661 | 161498 |
| 28 | IncF | MOBF | 3 | 3 | 118971 | 122714 |
| 29 | IncF | MOBF | 2 | 2 | 113639 | 113641 |
| 3 | IncU |
|
2 | 1 | 21549 | 21549 |
| 30 | ColRNAI_all | MOBF,MOBP | 2 | 1 | 142050 | 142353 |
| 31 | IncA/C2 | MOBH,MOBH | 4 | 4 | 160819 | 164596 |
| 32 | IncU | MOBP | 25 | 16 | 78125 | 99518 |
| 33 | IncF_combinations | MOBF | 2 | 1 | 128051 | 128105 |
| 34 | IncF_IncU_IncX | MOBF,MOBF | 7 | 5 | 121423 | 123974 |
| 35 | IncN_IncU | MOBF | 8 | 6 | 70461 | 76006 |
| 36 | IncF_combinations | MOBF | 8 | 5 | 111676 | 112911 |
| 37 | IncL/M |
|
4 | 4 | 50138 | 50138 |
| 38 | IncF_combinations | MOBF | 2 | 2 | 177751 | 198148 |
| 39 | IncA/C2 |
|
2 | 1 | 88057 | 88057 |
| 4 | IncF_combinations | MOBF | 2 | 2 | 105819 | 105819 |
| 40 | IncN_IncU_IncX | MOBP | 3 | 1 | 97507 | 97513 |
| 41 | IncF | MOBF | 2 | 2 | 30320 | 60013 |
| 42 | IncF | MOBF | 4 | 3 | 63637 | 93113 |
| 43 | IncH | MOBH | 4 | 3 | 266420 | 300349 |
| 5 | IncL/M | MOBP | 28 | 21 | 82120 | 91643 |
| 6 | IncN | MOBF | 286 | 211 | 38383 | 45284 |
| 7 | IncL/M | MOBP | 6 | 6 | 82759 | 91289 |
| 8 | IncA/C2 | MOBH,MOBH | 19 | 12 | 117950 | 127481 |
| 9 | IncX3 | MOBP | 18 | 16 | 45048 | 46161 |
| Singleton | IncF_IncU#IncF#IncX3#IncA/C2_IncF#IncH#IncA/C2#rep_cluster_1195#IncF_combinations#IncN#ColRNAI_all#IncN_IncU#IncU_IncX#IncU#IncA/C2_IncN#IncN_IncU_IncX#IncI_IncU#IncR | MOBF#-#MOBF,MOBH,MOBH#MOBH,MOBH#MOBH#MOBP#MOBF,MOBF#MOBP,MOBP#MOBF,MOBH#MOBC | 70 | 62 | 6494 | 476834 |
#write.table(Clusters_info_df,file="Clusters_info_datatable.txt",sep = "\t", row.names=FALSE) # drops the rownames and write to text file
We are trying to check how the plasmid is persistent across the year whereas the clonal cluster outbreaks are sporadic and short-lived.
library(lubridate)
library(ggplot2)
library(forcats)
library(glue)
Phylodynamics_staticplot <- function(plot_title,clusterNum){
#View(meta)
# Subsetting to a smaller set for ease of simplicity - selecting OXA-48 Plasmids records with n=41 rows
subset_clusterNum <- Closed_CP_Plasmids_metadata_df %>%
filter(ClusterNumber_based_on_0.9_cutoff==clusterNum) %>% # Although the variable number
mutate(Unique_Plasmid_Cluster = if_else(is.na(Clonal_cluster),paste0("Singleton_", row_number()), as.character(Clonal_cluster))) %>% # assign unique id for NA values
group_by(Unique_Plasmid_Cluster) %>%
mutate(group_id = cur_group_id())
subset_clusterNum$Date_of_culture <- as.Date(subset_clusterNum$Date_of_culture, format = "%d/%m/%Y")
#View(subset_clusterNum)
pd1 <- subset_clusterNum %>%
ungroup() %>%
mutate(Unique_Plasmid_Cluster = forcats::fct_reorder(Unique_Plasmid_Cluster, Date_of_culture, min)) %>%
ggplot(aes(Date_of_culture, Unique_Plasmid_Cluster,color = Lab_Species, shape= factor(Hospital))) +
geom_line(aes(group=Unique_Plasmid_Cluster), color='black') +
geom_point(size=3) +
ggtitle(plot_title) +
ylab("Cluster") + xlab("Date") +
theme_bw() +
theme(
panel.grid.major = element_blank(),
legend.position = "right",
legend.key=element_rect(fill='gray96'),
legend.title =element_text(size=10),
text=element_text(size=12),
axis.title.x = element_text(vjust = 0, size = 11),
axis.title.y = element_text(vjust = 2, size = 11),
axis.text.x = element_text(angle = 90, hjust = 1, size = 9),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank())
pd1
}
Phylodynamics_dynamicplot <- function(plot_title,clusterNum){
#View(meta)
# Subsetting to a smaller set for ease of simplicity - selecting OXA-48 Plasmids records with n=41 rows
subset_clusterNum <- Closed_CP_Plasmids_metadata_df %>%
filter(ClusterNumber_based_on_0.9_cutoff==clusterNum) %>% # Although the variable number
mutate(Unique_Plasmid_Cluster = if_else(is.na(Clonal_cluster),paste0("Singleton_", row_number()), as.character(Clonal_cluster))) %>% # assign unique id for NA values
group_by(Unique_Plasmid_Cluster) %>%
mutate(group_id = cur_group_id())
subset_clusterNum$Date_of_culture <- as.Date(subset_clusterNum$Date_of_culture, format = "%d/%m/%Y")
#View(subset_clusterNum)
pd1 <- subset_clusterNum %>%
ungroup() %>%
mutate(Unique_Plasmid_Cluster = forcats::fct_reorder(Unique_Plasmid_Cluster, Date_of_culture, min)) %>%
ggplot(aes(Date_of_culture, Unique_Plasmid_Cluster,color = Lab_Species, shape= factor(Hospital))) +
geom_line(aes(group=Unique_Plasmid_Cluster), color='black') +
geom_point(size=3) +
ggtitle(plot_title) +
ylab("Cluster") + xlab("Date") +
theme_bw() +
theme(
panel.grid.major = element_blank(),
legend.position = "right",
legend.key=element_rect(fill='gray96'),
legend.title =element_text(size=10),
text=element_text(size=12),
axis.title.x = element_text(vjust = 0, size = 11),
axis.title.y = element_text(vjust = 2, size = 11),
axis.text.x = element_text(angle = 90, hjust = 1, size = 9),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank())
#pd1
pd1_plotly <- plotly::ggplotly(pd1)
pd1_plotly
}
Phylodynamics_staticplot(plot_title="Phylodynamics of blaNDM-1 gene tranmission",clusterNum=6)
Phylodynamics_staticplot(plot_title="Phylodynamics of blaKPC-2 gene tranmission",clusterNum=2)
Phylodynamics_staticplot(plot_title="Phylodynamics of blaOXA-181 gene tranmission",clusterNum=16)
Phylodynamics_staticplot(plot_title="Phylodynamics of blaOXA-48 gene tranmission",clusterNum=19)
Phylodynamics_staticplot(plot_title="Phylodynamics of blaNDM-1 gene tranmission",clusterNum=5)
Phylodynamics_dynamicplot(plot_title="Phylodynamics of blaNDM-1 gene tranmission",clusterNum=6)
Phylodynamics_dynamicplot(plot_title="Phylodynamics of blaKPC-2 gene tranmission",clusterNum=2)
Phylodynamics_dynamicplot(plot_title="Phylodynamics of blaOXA-181 gene tranmission",clusterNum=16)
Phylodynamics_dynamicplot(plot_title="Phylodynamics of blaOXA-48 gene tranmission",clusterNum=19)
Phylodynamics_dynamicplot(plot_title="Phylodynamics of blaNDM-1 gene tranmission",clusterNum=5)
Click code in the right side to see the code
List <- c(6,2,16,19,5)
for (clusterNum in List) {
# Find the predominant CP gene in the cluster
cpgene <- Closed_CP_Plasmids_metadata_df %>%
filter(ClusterNumber_based_on_0.9_cutoff==clusterNum) %>%
select(CP_Gene) %>%
table() %>%
stack() %>%
arrange(values) %>%
tail(1) %>%
pull(ind)
# Generate Plasmid list for each cluster
Plasmid_listdf <- Closed_CP_Plasmids_metadata_df %>%
filter(ClusterNumber_based_on_0.9_cutoff==clusterNum) %>%
select(Plasmid_name_filename) %>%
mutate(Plasmid_name_filename=stringr::str_replace(Plasmid_name_filename, ".fasta", ""))
path <- "/data02/Analysis/Projects/2_CPE_Transmission/Plasmid_roary/Plasmids_Analysis_1071_Kmer_plot_11102020/"
prefix.name <- paste0("ClusterNumber_",clusterNum,"_",cpgene)
dir.create(paste0(path,prefix.name))
# Write to a file
write.table(Plasmid_listdf,
file=paste0(path,prefix.name,"/",prefix.name,"_plasmids.list"),
row.names=FALSE,
col.names = FALSE,
quote = FALSE) # drops the rownames and removes headers
}
# In Bash
cd /data02/Analysis/Projects/2_CPE_Transmission/Plasmid_roary/Plasmids_Analysis_1071_Kmer_plot_11102020
time bash runMe_Analysis.sh
In this section we draw the phylogenetic SNP tree of plasmids, SNPs in plasmid core genome, Gene Presence Absence Heatmap, Species,ST and Hospital Information and Plasmid lengths in different layers for each of the plasmid cluster
library(ggtree)
library(plotly)
library(ggplotlyExtra)
library(ggtreeExtra)
library(tidyverse)
library(glue)
Plasmid_SNPtree_SNPlocations_Heatmap <- function(location, treeInput, modifiedVCF,genePresAbsCSV, genePresAbsTAB, plot_title,
GenePA_offset,SpeciesST_offset,Plasmidlen_offset,SNPs_offset,Hospital_offset,
GenePA_pwidth,SpeciesST_pwidth,Plasmidlen_pwidth,SNPs_pwidth,Hospital_pwidth) {
# Load the modified vcf file
vcf <- data.table::fread(file = modifiedVCF, fill = TRUE, header = TRUE, sep = "\t", check.names = FALSE) # need input
vcf_reqcols <- vcf %>% select(-"CHROM",-"ID",-"ALT",-"QUAL",-"FILTER",-"INFO",-"FORMAT")
# Extracting data for SNP plot
snp_data_ggtree <- reshape2::melt(vcf_reqcols, id=c("POS","REF"),variable.name = "Plasmid", value.name = "BASE") %>%
filter(REF != BASE) %>%
filter(BASE != "-") %>%
filter(BASE != "N") %>%
select(Plasmid,POS) %>%
arrange(Plasmid,POS)
# Add metadata
meta <- read.table("/data02/Analysis/Projects/2_CPE_Transmission/Plasmid_roary/Plasmids_1071_PCA_plot/Closed_CP_Plasmids_1071_complete_v3_Plasmid_meta_for_gubbins.csv", sep = ",", header = TRUE)
# Mutate a new data column with both species and ST
meta <- meta %>% mutate(Species_ST=paste0(Lab_Species,"_ST",ST))
# To load and add annotation to the genes since the roary only puts group_names
heatmapData <- read.table(genePresAbsTAB, header = TRUE, check.names = FALSE, sep = "\t")
GeneAnnot_mat <- read.csv(genePresAbsCSV, header = TRUE, check.names = FALSE, sep = ",")
heatmapData2 <- GeneAnnot_mat %>%
select(Gene, Annotation) %>%
right_join(heatmapData, by = 'Gene') %>% unite(Gene_Annotation, c("Gene", "Annotation"))
write.csv(heatmapData2,file = glue('{location}geneAnnotation_presence_absence_intermediate1.csv'))
tmp <- as.data.frame(t(heatmapData2[,-1]))
colnames(tmp) <- heatmapData2$Gene
write.csv(tmp,file = glue('{location}geneAnnotation_presence_absence_intermediate2.csv'))
# For geom_fruit it is easier If I have long format, so generating tmp_long dataframe
tmp_long <- tmp %>%
tibble::rownames_to_column("fasta") %>%
reshape2::melt(variable = "Gene", value.name = "Pres_Abs")
# Load the postgubbins tree
thetree <- read.tree(glue('{location}postGubbins.final_tree.tre')) # need input
q <- ggtree(thetree, colour = "orange", ladderize = FALSE, right = FALSE, size = 0.25) +
geom_tippoint(size = 2,shape=1, color="red") +
geom_rootpoint() +
theme_tree("white") +
geom_treescale() +
labs(title=glue('{plot_title}')) # need input
# Plotting and adding layers using geom_fruit
# Plot1 Gene Presence Absence Plot
qq1 <- q + geom_fruit(data=tmp_long,geom=geom_tile,
mapping = aes(y=fasta, x=Gene, fill=factor(Pres_Abs)),
# color = "white",
offset=GenePA_offset,pwidth=GenePA_pwidth,
axis.params = list(axis="x", title = "Gene Presence Absence Matrix", title.size = 3, text.size=2,
vjust=0.5, line.size=0, hjust = 1, text.angle=90),
# grid.params=list(color="white")
) +
scale_fill_manual(values=c("1"="#0000FF","0"="#FFA500"),
guide=guide_legend(ncol = 1, order=1,title.position = "top")) +
labs(fill="Gene Presence/Absence") +
theme(legend.position= 'bottom', plot.title = element_text(size=15, hjust = 0.5))
#qq1
# Plot2 Species and ST Layer
qq2 <- qq1 + ggnewscale::new_scale_fill() +
geom_fruit(data=meta,geom=geom_tile,
mapping = aes(y=Plasmid,x=Species_ST,fill=Lab_Species),
offset=SpeciesST_offset,pwidth=SpeciesST_pwidth,
axis.params = list(axis="x", title = "Species and ST", title.size = 3, text.size=2,
vjust=0.5, line.size=0, hjust = 1, text.angle=90),grid.params=list()) +
scale_fill_manual(values=c("#000000", "#E69F00", "#56B4E9", "#009E73","#F0E442", "#0072B2", "#D55E00", "#CC79A7",
"#00AFBB", "#E7B800", "#FC4E07","#4dbfbc"),
guide = guide_legend(ncol=2,order = 2,title.position = "top")) +
labs(fill="Species")
#qq2
# Plot3 Plasmid Lengths layer
qq3 <- qq2 + ggnewscale::new_scale_fill() +
geom_fruit(data=meta,geom=geom_point,
mapping = aes(y=Plasmid,x=Length,color="Plasmid Length"),shape=0,
offset=Plasmidlen_offset,pwidth=Plasmidlen_pwidth,
axis.params = list(axis="x", title = "Plasmid Lengths (bases)", title.size = 3, text.size=2,
vjust=0.5, line.size=0, hjust = 1, text.angle=90),grid.params=list()) +
scale_color_manual(values=c("blue","#0a7a68"),guide = guide_legend(order = 3,title.position = "top"))
#qq3
# Plot3 SNPs in Plasmid Core Genome layer
qq4 <- qq3 + ggnewscale::new_scale_fill() +
geom_fruit(data=snp_data_ggtree,geom=geom_point,
mapping = aes(y=Plasmid,x=POS, color="SNP"),shape=1,
offset=SNPs_offset,pwidth=SNPs_pwidth,#,GenePA_offset=0.1,SpeciesST_offset=0.1,Plasmidlen_offset=0.2,SNPs_offset=0.25,Hospital_offset=0.25,
axis.params = list(axis="x", title = "SNPs in core genome (bases)", title.size = 3, text.size=2,
vjust=0.5, line.size=0, hjust = 1, text.angle=90),grid.params=list()) +
scale_color_manual(values=c("blue","#00AFBB"),
guide=guide_legend(ncol = 1, order=4,title.position = "top")) +
labs(color="Scatter Points")
#qq4
# Plot5 Hospital Information layer
qq5 <- qq4 + ggnewscale::new_scale_fill() +
geom_fruit(data=meta,geom=geom_tile,
mapping = aes(y=Plasmid,fill=factor(Hospital)),
offset=Hospital_offset,pwidth=Hospital_pwidth,
axis.params = list(axis="x", title = "Hospital", title.size = 3, text.size=2,
vjust=0.5, line.size=0, hjust = 1, text.angle=90)) +
scale_fill_manual(values = c('red', 'blue', 'green','orange', "#00AFBB","#000000"),
guide = guide_legend(ncol=1, order = 5,title.position = "top")) +
labs(fill="Hospital")
qq6 <- qq5 + vexpand(.4, -1) + vexpand(.1,1)
#
qq6
}
#Cluster 6 blaNDM-1 plasmids
folderloc <- "/data02/Analysis/Projects/2_CPE_Transmission/Plasmid_roary/Plasmids_Analysis_1071_Kmer_plot_11102020/ClusterNumber_6_blaNDM1/gff/CoreGenome_AND_gubbins/"
Plasmid_SNPtree_SNPlocations_Heatmap(
location=folderloc,
treeInput=glue('{folderloc}postGubbins.final_tree.tre'),
modifiedVCF=glue('{folderloc}modified.vcf'),
genePresAbsCSV=glue('{folderloc}gene_presence_absence.csv'),
genePresAbsTAB=glue('{folderloc}gene_presence_absence.Rtab'),
plot_title="Cluster 6 blaNDM-1 plasmids",
GenePA_offset=0.1,SpeciesST_offset=0.1,Plasmidlen_offset=0.2,SNPs_offset=0.25,Hospital_offset=0.25,
GenePA_pwidth=2,SpeciesST_pwidth=3,Plasmidlen_pwidth=1,SNPs_pwidth=1,Hospital_pwidth=0.07
)
# Cluster 5 blaNDM-1 plasmids
folderloc <- "/data02/Analysis/Projects/2_CPE_Transmission/Plasmid_roary/Plasmids_Analysis_1071_Kmer_plot_11102020/ClusterNumber_5_blaNDM1/gff/CoreGenome_AND_gubbins/"
Plasmid_SNPtree_SNPlocations_Heatmap(
location=folderloc,
treeInput=glue('{folderloc}postGubbins.final_tree.tre'),
modifiedVCF=glue('{folderloc}modified.vcf'),
genePresAbsCSV=glue('{folderloc}gene_presence_absence.csv'),
genePresAbsTAB=glue('{folderloc}gene_presence_absence.Rtab'),
plot_title="Cluster 5 blaNDM-1 plasmids",
GenePA_offset=0.1,SpeciesST_offset=0.1,Plasmidlen_offset=0.2,SNPs_offset=0.25,Hospital_offset=0.25,
GenePA_pwidth=3,SpeciesST_pwidth=2,Plasmidlen_pwidth=1,SNPs_pwidth=1,Hospital_pwidth=0.07
)
#Cluster 2 blaKPC-2 plasmids
folderloc <- "/data02/Analysis/Projects/2_CPE_Transmission/Plasmid_roary/Plasmids_Analysis_1071_Kmer_plot_11102020/ClusterNumber_2_blaKPC2/gff/CoreGenome_AND_gubbins/"
Plasmid_SNPtree_SNPlocations_Heatmap(
location=folderloc,
treeInput=glue('{folderloc}postGubbins.final_tree.tre'),
modifiedVCF=glue('{folderloc}modified.vcf'),
genePresAbsCSV=glue('{folderloc}gene_presence_absence.csv'),
genePresAbsTAB=glue('{folderloc}gene_presence_absence.Rtab'),
plot_title="Cluster 2 blaKPC-2 plasmids",
GenePA_offset=0.1,SpeciesST_offset=0.1,Plasmidlen_offset=0.2,SNPs_offset=0.25,Hospital_offset=0.25,
GenePA_pwidth=3,SpeciesST_pwidth=2,Plasmidlen_pwidth=1,SNPs_pwidth=1,Hospital_pwidth=0.07
)
#Cluster 16 blaOXA-181 plasmids
folderloc <- "/data02/Analysis/Projects/2_CPE_Transmission/Plasmid_roary/Plasmids_Analysis_1071_Kmer_plot_11102020/ClusterNumber_16_blaOXA181/gff/CoreGenome_AND_gubbins/"
Plasmid_SNPtree_SNPlocations_Heatmap(
location=folderloc,
treeInput=glue('{folderloc}postGubbins.final_tree.tre'),
modifiedVCF=glue('{folderloc}modified.vcf'),
genePresAbsCSV=glue('{folderloc}gene_presence_absence.csv'),
genePresAbsTAB=glue('{folderloc}gene_presence_absence.Rtab'),
plot_title="Cluster 16 blaOXA-181 plasmids",
GenePA_offset=0.1,SpeciesST_offset=0.1,Plasmidlen_offset=0.2,SNPs_offset=0.25,Hospital_offset=0.25,
GenePA_pwidth=3,SpeciesST_pwidth=1,Plasmidlen_pwidth=1,SNPs_pwidth=1,Hospital_pwidth=0.07
)
#Cluster 19 blaOXA-48 plasmids
folderloc <- "/data02/Analysis/Projects/2_CPE_Transmission/Plasmid_roary/Plasmids_Analysis_1071_Kmer_plot_11102020/ClusterNumber_19_blaOXA48/gff/CoreGenome_AND_gubbins/"
Plasmid_SNPtree_SNPlocations_Heatmap(
location=folderloc,
treeInput=glue('{folderloc}postGubbins.final_tree.tre'),
modifiedVCF=glue('{folderloc}modified.vcf'),
genePresAbsCSV=glue('{folderloc}gene_presence_absence.csv'),
genePresAbsTAB=glue('{folderloc}gene_presence_absence.Rtab'),
plot_title="Cluster 19 blaOXA-48 plasmids",
GenePA_offset=0.1,SpeciesST_offset=0.1,Plasmidlen_offset=0.2,SNPs_offset=0.25,Hospital_offset=0.25,
GenePA_pwidth=3,SpeciesST_pwidth=1,Plasmidlen_pwidth=1,SNPs_pwidth=1,Hospital_pwidth=0.07
)